includes/Sanitizer.php

   1 <?php
   2
   3 /**
   4  * (X)HTML sanitizer for MediaWiki
   5  *
   6  * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
   7  * http://www.mediawiki.org/
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation; either version 2 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License along
  20  * with this program; if not, write to the Free Software Foundation, Inc.,
  21  * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  22  * http://www.gnu.org/copyleft/gpl.html
  23  *
  24  * @package MediaWiki
  25  * @subpackage Parser
  26  */
  27
  28 /**
  29  * Regular expression to match various types of character references in
  30  * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
  31  */
  32 define( 'MW_CHAR_REFS_REGEX',
  33         '/&([A-Za-z0-9]+);
  34          |&\#([0-9]+);
  35          |&\#x([0-9A-Za-z]+);
  36          |&\#X([0-9A-Za-z]+);
  37          |(&)/x' );
  38
  39 /**
  40  * Regular expression to match HTML/XML attribute pairs within a tag.
  41  * Allows some... latitude.
  42  * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
      *
      * $attrib (one attribute-name character) and $space (one whitespace
      * character) are interpolated into the pattern below.
      *
      * Capture groups:
      *   1. attribute name
      *   2. the optional "= value" part, including the equals sign
      *   3. value written in double quotes
      *   4. value written in single quotes
      *   5. unquoted value
      *   6. unquoted value of the form #hex (e.g. a colour)
      * The lookahead at the end requires each pair to be followed by
      * whitespace or the end of the string.
  43  */
  44 $attrib = '[A-Za-z0-9]';
  45 $space = '[\x09\x0a\x0d\x20]';
  46 define( 'MW_ATTRIBS_REGEX',
  47         "/(?:^|$space)($attrib+)
  48           ($space*=$space*
  49                 (?:
  50                  # The attribute value: quoted or alone
  51                   \"([^<\"]*)\"
  52                  | '([^<']*)'
  53                  |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
  54                  |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
  55                                                          # colors are specified like this.
  56                                                          # We'll be normalizing it.
  57                 )
  58            )?(?=$space|\$)/sx" );
  59
  60 /**
  61  * List of all named character entities defined in HTML 4.01
  62  * http://www.w3.org/TR/html4/sgml/entities.html
      *
      * Maps each entity name (case-sensitive, written without the leading
      * '&' and the trailing ';') to its code point as a decimal integer.
      * Sanitizer::normalizeEntity() looks names up in this table to decide
      * whether a named reference is passed through or escaped.
  63  * @access private
  64  */
  65 global $wgHtmlEntities;
  66 $wgHtmlEntities = array(
  67         'Aacute'   => 193,
  68         'aacute'   => 225,
  69         'Acirc'    => 194,
  70         'acirc'    => 226,
  71         'acute'    => 180,
  72         'AElig'    => 198,
  73         'aelig'    => 230,
  74         'Agrave'   => 192,
  75         'agrave'   => 224,
  76         'alefsym'  => 8501,
  77         'Alpha'    => 913,
  78         'alpha'    => 945,
  79         'amp'      => 38,
  80         'and'      => 8743,
  81         'ang'      => 8736,
  82         'Aring'    => 197,
  83         'aring'    => 229,
  84         'asymp'    => 8776,
  85         'Atilde'   => 195,
  86         'atilde'   => 227,
  87         'Auml'     => 196,
  88         'auml'     => 228,
  89         'bdquo'    => 8222,
  90         'Beta'     => 914,
  91         'beta'     => 946,
  92         'brvbar'   => 166,
  93         'bull'     => 8226,
  94         'cap'      => 8745,
  95         'Ccedil'   => 199,
  96         'ccedil'   => 231,
  97         'cedil'    => 184,
  98         'cent'     => 162,
  99         'Chi'      => 935,
 100         'chi'      => 967,
 101         'circ'     => 710,
 102         'clubs'    => 9827,
 103         'cong'     => 8773,
 104         'copy'     => 169,
 105         'crarr'    => 8629,
 106         'cup'      => 8746,
 107         'curren'   => 164,
 108         'dagger'   => 8224,
 109         'Dagger'   => 8225,
 110         'darr'     => 8595,
 111         'dArr'     => 8659,
 112         'deg'      => 176,
 113         'Delta'    => 916,
 114         'delta'    => 948,
 115         'diams'    => 9830,
 116         'divide'   => 247,
 117         'Eacute'   => 201,
 118         'eacute'   => 233,
 119         'Ecirc'    => 202,
 120         'ecirc'    => 234,
 121         'Egrave'   => 200,
 122         'egrave'   => 232,
 123         'empty'    => 8709,
 124         'emsp'     => 8195,
 125         'ensp'     => 8194,
 126         'Epsilon'  => 917,
 127         'epsilon'  => 949,
 128         'equiv'    => 8801,
 129         'Eta'      => 919,
 130         'eta'      => 951,
 131         'ETH'      => 208,
 132         'eth'      => 240,
 133         'Euml'     => 203,
 134         'euml'     => 235,
 135         'euro'     => 8364,
 136         'exist'    => 8707,
 137         'fnof'     => 402,
 138         'forall'   => 8704,
 139         'frac12'   => 189,
 140         'frac14'   => 188,
 141         'frac34'   => 190,
 142         'frasl'    => 8260,
 143         'Gamma'    => 915,
 144         'gamma'    => 947,
 145         'ge'       => 8805,
 146         'gt'       => 62,
 147         'harr'     => 8596,
 148         'hArr'     => 8660,
 149         'hearts'   => 9829,
 150         'hellip'   => 8230,
 151         'Iacute'   => 205,
 152         'iacute'   => 237,
 153         'Icirc'    => 206,
 154         'icirc'    => 238,
 155         'iexcl'    => 161,
 156         'Igrave'   => 204,
 157         'igrave'   => 236,
 158         'image'    => 8465,
 159         'infin'    => 8734,
 160         'int'      => 8747,
 161         'Iota'     => 921,
 162         'iota'     => 953,
 163         'iquest'   => 191,
 164         'isin'     => 8712,
 165         'Iuml'     => 207,
 166         'iuml'     => 239,
 167         'Kappa'    => 922,
 168         'kappa'    => 954,
 169         'Lambda'   => 923,
 170         'lambda'   => 955,
 171         'lang'     => 9001,
 172         'laquo'    => 171,
 173         'larr'     => 8592,
 174         'lArr'     => 8656,
 175         'lceil'    => 8968,
 176         'ldquo'    => 8220,
 177         'le'       => 8804,
 178         'lfloor'   => 8970,
 179         'lowast'   => 8727,
 180         'loz'      => 9674,
 181         'lrm'      => 8206,
 182         'lsaquo'   => 8249,
 183         'lsquo'    => 8216,
 184         'lt'       => 60,
 185         'macr'     => 175,
 186         'mdash'    => 8212,
 187         'micro'    => 181,
 188         'middot'   => 183,
 189         'minus'    => 8722,
 190         'Mu'       => 924,
 191         'mu'       => 956,
 192         'nabla'    => 8711,
 193         'nbsp'     => 160,
 194         'ndash'    => 8211,
 195         'ne'       => 8800,
 196         'ni'       => 8715,
 197         'not'      => 172,
 198         'notin'    => 8713,
 199         'nsub'     => 8836,
 200         'Ntilde'   => 209,
 201         'ntilde'   => 241,
 202         'Nu'       => 925,
 203         'nu'       => 957,
 204         'Oacute'   => 211,
 205         'oacute'   => 243,
 206         'Ocirc'    => 212,
 207         'ocirc'    => 244,
 208         'OElig'    => 338,
 209         'oelig'    => 339,
 210         'Ograve'   => 210,
 211         'ograve'   => 242,
 212         'oline'    => 8254,
 213         'Omega'    => 937,
 214         'omega'    => 969,
 215         'Omicron'  => 927,
 216         'omicron'  => 959,
 217         'oplus'    => 8853,
 218         'or'       => 8744,
 219         'ordf'     => 170,
 220         'ordm'     => 186,
 221         'Oslash'   => 216,
 222         'oslash'   => 248,
 223         'Otilde'   => 213,
 224         'otilde'   => 245,
 225         'otimes'   => 8855,
 226         'Ouml'     => 214,
 227         'ouml'     => 246,
 228         'para'     => 182,
 229         'part'     => 8706,
 230         'permil'   => 8240,
 231         'perp'     => 8869,
 232         'Phi'      => 934,
 233         'phi'      => 966,
 234         'Pi'       => 928,
 235         'pi'       => 960,
 236         'piv'      => 982,
 237         'plusmn'   => 177,
 238         'pound'    => 163,
 239         'prime'    => 8242,
 240         'Prime'    => 8243,
 241         'prod'     => 8719,
 242         'prop'     => 8733,
 243         'Psi'      => 936,
 244         'psi'      => 968,
 245         'quot'     => 34,
 246         'radic'    => 8730,
 247         'rang'     => 9002,
 248         'raquo'    => 187,
 249         'rarr'     => 8594,
 250         'rArr'     => 8658,
 251         'rceil'    => 8969,
 252         'rdquo'    => 8221,
 253         'real'     => 8476,
 254         'reg'      => 174,
 255         'rfloor'   => 8971,
 256         'Rho'      => 929,
 257         'rho'      => 961,
 258         'rlm'      => 8207,
 259         'rsaquo'   => 8250,
 260         'rsquo'    => 8217,
 261         'sbquo'    => 8218,
 262         'Scaron'   => 352,
 263         'scaron'   => 353,
 264         'sdot'     => 8901,
 265         'sect'     => 167,
 266         'shy'      => 173,
 267         'Sigma'    => 931,
 268         'sigma'    => 963,
 269         'sigmaf'   => 962,
 270         'sim'      => 8764,
 271         'spades'   => 9824,
 272         'sub'      => 8834,
 273         'sube'     => 8838,
 274         'sum'      => 8721,
 275         'sup'      => 8835,
 276         'sup1'     => 185,
 277         'sup2'     => 178,
 278         'sup3'     => 179,
 279         'supe'     => 8839,
 280         'szlig'    => 223,
 281         'Tau'      => 932,
 282         'tau'      => 964,
 283         'there4'   => 8756,
 284         'Theta'    => 920,
 285         'theta'    => 952,
 286         'thetasym' => 977,
 287         'thinsp'   => 8201,
 288         'THORN'    => 222,
 289         'thorn'    => 254,
 290         'tilde'    => 732,
 291         'times'    => 215,
 292         'trade'    => 8482,
 293         'Uacute'   => 218,
 294         'uacute'   => 250,
 295         'uarr'     => 8593,
 296         'uArr'     => 8657,
 297         'Ucirc'    => 219,
 298         'ucirc'    => 251,
 299         'Ugrave'   => 217,
 300         'ugrave'   => 249,
 301         'uml'      => 168,
 302         'upsih'    => 978,
 303         'Upsilon'  => 933,
 304         'upsilon'  => 965,
 305         'Uuml'     => 220,
 306         'uuml'     => 252,
 307         'weierp'   => 8472,
 308         'Xi'       => 926,
 309         'xi'       => 958,
 310         'Yacute'   => 221,
 311         'yacute'   => 253,
 312         'yen'      => 165,
 313         'Yuml'     => 376,
 314         'yuml'     => 255,
 315         'Zeta'     => 918,
 316         'zeta'     => 950,
 317         'zwj'      => 8205,
 318         'zwnj'     => 8204 );
 319
 320 class Sanitizer {
 321         /**
 322          * Cleans up HTML, removes dangerous tags and attributes, and
 323          * removes HTML comments
 324          * @access private
 325          * @param string $text
 326          * @param callback $processCallback to do any variable or parameter replacements in HTML attribute values
 327          * @param array $args for the processing callback
 328          * @return string
 329          */
 330         function removeHTMLtags( $text, $processCallback = null, $args = array() ) {
 331                 global $wgUseTidy, $wgUserHtml;
 332                 $fname = 'Parser::removeHTMLtags';
 333                 wfProfileIn( $fname );
 334
 335                 if( $wgUserHtml ) {
 336                         $htmlpairs = array( # Tags that must be closed
 337                                 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
 338                                 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
 339                                 'strike', 'strong', 'tt', 'var', 'div', 'center',
 340                                 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
 341                                 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span'
 342                         );
 343                         $htmlsingle = array(
 344                                 'br', 'hr', 'li', 'dt', 'dd'
 345                         );
 346                         $htmlnest = array( # Tags that can be nested--??
 347                                 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
 348                                 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
 349                         );
 350                         $tabletags = array( # Can only appear inside table
 351                                 'td', 'th', 'tr'
 352                         );
 353                 } else {
 354                         $htmlpairs = array();
 355                         $htmlsingle = array();
 356                         $htmlnest = array();
 357                         $tabletags = array();
 358                 }
 359
 360                 $htmlsingle = array_merge( $tabletags, $htmlsingle );
 361                 $htmlelements = array_merge( $htmlsingle, $htmlpairs );
 362
 363                 # Remove HTML comments
 364                 $text = Sanitizer::removeHTMLcomments( $text );
 365
 366                 $bits = explode( '<', $text );
 367                 $text = array_shift( $bits );
 368                 if(!$wgUseTidy) {
 369                         $tagstack = array(); $tablestack = array();
 370                         foreach ( $bits as $x ) {
 371                                 $prev = error_reporting( E_ALL & ~( E_NOTICE | E_WARNING ) );
 372                                 preg_match( '/^(\\/?)(\\w+)([^>]*)(\\/{0,1}>)([^<]*)$/',
 373                                 $x, $regs );
 374                                 list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
 375                                 error_reporting( $prev );
 376
 377                                 $badtag = 0 ;
 378                                 if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
 379                                         # Check our stack
 380                                         if ( $slash ) {
 381                                                 # Closing a tag...
 382                                                 if ( ! in_array( $t, $htmlsingle ) &&
 383                                                 ( $ot = @array_pop( $tagstack ) ) != $t ) {
 384                                                         @array_push( $tagstack, $ot );
 385                                                         $badtag = 1;
 386                                                 } else {
 387                                                         if ( $t == 'table' ) {
 388                                                                 $tagstack = array_pop( $tablestack );
 389                                                         }
 390                                                         $newparams = '';
 391                                                 }
 392                                         } else {
 393                                                 # Keep track for later
 394                                                 if ( in_array( $t, $tabletags ) &&
 395                                                 ! in_array( 'table', $tagstack ) ) {
 396                                                         $badtag = 1;
 397                                                 } else if ( in_array( $t, $tagstack ) &&
 398                                                 ! in_array ( $t , $htmlnest ) ) {
 399                                                         $badtag = 1 ;
 400                                                 } else if ( ! in_array( $t, $htmlsingle ) ) {
 401                                                         if ( $t == 'table' ) {
 402                                                                 array_push( $tablestack, $tagstack );
 403                                                                 $tagstack = array();
 404                                                         }
 405                                                         array_push( $tagstack, $t );
 406                                                 }
 407
 408                                                 # Replace any variables or template parameters with
 409                                                 # plaintext results.
 410                                                 if( is_callable( $processCallback ) ) {
 411                                                         call_user_func_array( $processCallback, array( &$params, $args ) );
 412                                                 }
 413
 414                                                 # Strip non-approved attributes from the tag
 415                                                 $newparams = Sanitizer::fixTagAttributes( $params, $t );
 416                                         }
 417                                         if ( ! $badtag ) {
 418                                                 $rest = str_replace( '>', '&gt;', $rest );
 419                                                 $text .= "<$slash$t$newparams$brace$rest";
 420                                                 continue;
 421                                         }
 422                                 }
 423                                 $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 424                         }
 425                         # Close off any remaining tags
 426                         while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
 427                                 $text .= "</$t>\n";
 428                                 if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
 429                         }
 430                 } else {
 431                         # this might be possible using tidy itself
 432                         foreach ( $bits as $x ) {
 433                                 preg_match( '/^(\\/?)(\\w+)([^>]*)(\\/{0,1}>)([^<]*)$/',
 434                                 $x, $regs );
 435                                 @list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
 436                                 if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
 437                                         if( is_callable( $processCallback ) ) {
 438                                                 call_user_func_array( $processCallback, array( &$params, $args ) );
 439                                         }
 440                                         $newparams = Sanitizer::fixTagAttributes( $params, $t );
 441                                         $rest = str_replace( '>', '&gt;', $rest );
 442                                         $text .= "<$slash$t$newparams$brace$rest";
 443                                 } else {
 444                                         $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 445                                 }
 446                         }
 447                 }
 448                 wfProfileOut( $fname );
 449                 return $text;
 450         }
 451
 452         /**
 453          * Remove '<!--', '-->', and everything between.
 454          * To avoid leaving blank lines, when a comment is both preceded
 455          * and followed by a newline (ignoring spaces), trim leading and
 456          * trailing spaces and one of the newlines.
 457          *
 458          * @access private
 459          * @param string $text
 460          * @return string
 461          */
 462         function removeHTMLcomments( $text ) {
 463                 $fname='Parser::removeHTMLcomments';
 464                 wfProfileIn( $fname );
 465                 while (($start = strpos($text, '<!--')) !== false) {
 466                         $end = strpos($text, '-->', $start + 4);
 467                         if ($end === false) {
 468                                 # Unterminated comment; bail out
 469                                 break;
 470                         }
 471
 472                         $end += 3;
 473
 474                         # Trim space and newline if the comment is both
 475                         # preceded and followed by a newline
 476                         $spaceStart = max($start - 1, 0);
 477                         $spaceLen = $end - $spaceStart;
 478                         while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) {
 479                                 $spaceStart--;
 480                                 $spaceLen++;
 481                         }
 482                         while (substr($text, $spaceStart + $spaceLen, 1) === ' ')
 483                                 $spaceLen++;
 484                         if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart + $spaceLen, 1) === "\n") {
 485                                 # Remove the comment, leading and trailing
 486                                 # spaces, and leave only one newline.
 487                                 $text = substr_replace($text, "\n", $spaceStart, $spaceLen + 1);
 488                         }
 489                         else {
 490                                 # Remove just the comment.
 491                                 $text = substr_replace($text, '', $start, $end - $start);
 492                         }
 493                 }
 494                 wfProfileOut( $fname );
 495                 return $text;
 496         }
 497
 498         /**
 499          * Take a tag soup fragment listing an HTML element's attributes
 500          * and normalize it to well-formed XML, discarding unwanted attributes.
 501          *
 502          * - Normalizes attribute names to lowercase
 503          * - Discards attributes not on a whitelist for the given element
 504          * - Turns broken or invalid entities into plaintext
 505          * - Double-quotes all attribute values
 506          * - Attributes without values are given the name as attribute
 507          * - Double attributes are discarded
 508          * - Unsafe style attributes are discarded
 509          * - Prepends space if there are attributes.
 510          *
 511          * @param string $text
 512          * @param string $element
 513          * @return string
 514          *
 515          * @todo Check for legal values where the DTD limits things.
 516          * @todo Check for unique id attribute :P
 517          */
 518         function fixTagAttributes( $text, $element ) {
 519                 if( trim( $text ) == '' ) {
 520                         return '';
 521                 }
 522
 523                 # Unquoted attribute
 524                 # Since we quote this later, this can be anything distinguishable
 525                 # from the end of the attribute
 526                 if( !preg_match_all(
 527                         MW_ATTRIBS_REGEX,
 528                         $text,
 529                         $pairs,
 530                         PREG_SET_ORDER ) ) {
 531                         return '';
 532                 }
 533
 534                 $whitelist = array_flip( Sanitizer::attributeWhitelist( $element ) );
 535                 $attribs = array();
 536                 foreach( $pairs as $set ) {
 537                         $attribute = strtolower( $set[1] );
 538                         if( !isset( $whitelist[$attribute] ) ) {
 539                                 continue;
 540                         }
 541
 542                         $raw   = Sanitizer::getTagAttributeCallback( $set );
 543                         $value = Sanitizer::normalizeAttributeValue( $raw );
 544
 545                         # Strip javascript "expression" from stylesheets.
 546                         # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
 547                         if( $attribute == 'style' && preg_match(
 548                                 '/(expression|tps*:\/\/|url\\s*\().*/is',
 549                                         Sanitizer::decodeCharReferences( $value ) ) ) {
 550                                 # haxx0r
 551                                 continue;
 552                         }
 553
 554                         # Templates and links may be expanded in later parsing,
 555                         # creating invalid or dangerous output. Suppress this.
 556                         $value = strtr( $value, array(
 557                                 '{'    => '&#123;',
 558                                 '['    => '&#91;',
 559                                 "''"   => '&#39;&#39;',
 560                                 'ISBN' => '&#73;SBN',
 561                                 'RFC'  => '&#82;FC',
 562                                 'PMID' => '&#80;MID',
 563                         ) );
 564                         $value = preg_replace(
 565                                 '/(' . URL_PROTOCOLS . '):/',
 566                                 '\\1&#58;', $value );
 567
 568                         if( !isset( $attribs[$attribute] ) ) {
 569                                 $attribs[$attribute] = "$attribute=\"$value\"";
 570                         }
 571                 }
 572                 if( empty( $attribs ) ) {
 573                         return '';
 574                 } else {
 575                         return ' ' . implode( ' ', $attribs );
 576                 }
 577         }
 578
 579         /**
 580          * Return an associative array of attribute names and values from
 581          * a partial tag string. Attribute names are forces to lowercase,
 582          * character references are decoded to UTF-8 text.
 583          *
 584          * @param string
 585          * @return array
 586          */
 587         function decodeTagAttributes( $text ) {
 588                 $attribs = array();
 589
 590                 if( trim( $text ) == '' ) {
 591                         return $attribs;
 592                 }
 593
 594                 if( !preg_match_all(
 595                         MW_ATTRIBS_REGEX,
 596                         $text,
 597                         $pairs,
 598                         PREG_SET_ORDER ) ) {
 599                         return $attribs;
 600                 }
 601
 602                 foreach( $pairs as $set ) {
 603                         $attribute = strtolower( $set[1] );
 604                         $value = Sanitizer::getTagAttributeCallback( $set );
 605                         $attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
 606                 }
 607                 return $attribs;
 608         }
 609
 610         /**
 611          * Pick the appropriate attribute value from a match set from the
 612          * MW_ATTRIBS_REGEX matches.
 613          *
 614          * @param array $set
 615          * @return string
 616          * @access private
 617          */
 618         function getTagAttributeCallback( $set ) {
 619                 if( isset( $set[6] ) ) {
 620                         # Illegal #XXXXXX color with no quotes.
 621                         return $set[6];
 622                 } elseif( isset( $set[5] ) ) {
 623                         # No quotes.
 624                         return $set[5];
 625                 } elseif( isset( $set[4] ) ) {
 626                         # Single-quoted
 627                         return $set[4];
 628                 } elseif( isset( $set[3] ) ) {
 629                         # Double-quoted
 630                         return $set[3];
 631                 } elseif( !isset( $set[2] ) ) {
 632                         # In XHTML, attributes must have a value.
 633                         # For 'reduced' form, return explicitly the attribute name here.
 634                         return $set[1];
 635                 } else {
 636                         wfDebugDieBacktrace( "Tag conditions not met. This should never happen and is a bug." );
 637                 }
 638         }
 639
 640         /**
 641          * Normalize whitespace and character references in an XML source-
 642          * encoded text for an attribute value.
 643          *
 644          * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
 645          * but note that we're not returning the value, but are returning
 646          * XML source fragments that will be slapped into output.
 647          *
 648          * @param string $text
 649          * @return string
 650          * @access private
 651          */
 652         function normalizeAttributeValue( $text ) {
 653                 return str_replace( '"', '&quot;',
 654                         preg_replace(
 655                                 '/\r\n|[\x20\x0d\x0a\x09]/',
 656                                 ' ',
 657                                 Sanitizer::normalizeCharReferences( $text ) ) );
 658         }
 659
 660         /**
 661          * Ensure that any entities and character references are legal
 662          * for XML and XHTML specifically. Any stray bits will be
 663          * &amp;-escaped to result in a valid text fragment.
 664          *
 665          * a. any named char refs must be known in XHTML
 666          * b. any numeric char refs must be legal chars, not invalid or forbidden
 667          * c. use &#x, not &#X
 668          * d. fix or reject non-valid attributes
 669          *
 670          * @param string $text
 671          * @return string
 672          * @access private
 673          */
 674         function normalizeCharReferences( $text ) {
 675                 return preg_replace_callback(
 676                         MW_CHAR_REFS_REGEX,
 677                         array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
 678                         $text );
 679         }
 680         /**
 681          * @param string $matches
 682          * @return string
 683          */
 684         function normalizeCharReferencesCallback( $matches ) {
 685                 $ret = null;
 686                 if( $matches[1] != '' ) {
 687                         $ret = Sanitizer::normalizeEntity( $matches[1] );
 688                 } elseif( $matches[2] != '' ) {
 689                         $ret = Sanitizer::decCharReference( $matches[2] );
 690                 } elseif( $matches[3] != ''  ) {
 691                         $ret = Sanitizer::hexCharReference( $matches[3] );
 692                 } elseif( $matches[4] != '' ) {
 693                         $ret = Sanitizer::hexCharReference( $matches[4] );
 694                 }
 695                 if( is_null( $ret ) ) {
 696                         return htmlspecialchars( $matches[0] );
 697                 } else {
 698                         return $ret;
 699                 }
 700         }
 701
 702         /**
 703          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
 704          * return the named entity reference as is. Otherwise, returns
 705          * HTML-escaped text of pseudo-entity source (eg &amp;foo;)
 706          *
 707          * @param string $name
 708          * @return string
 709          */
 710         function normalizeEntity( $name ) {
 711                 global $wgHtmlEntities;
 712                 if( isset( $wgHtmlEntities[$name] ) ) {
 713                         return "&$name;";
 714                 } else {
 715                         return "&amp;$name;";
 716                 }
 717         }
 718
 719         function decCharReference( $codepoint ) {
 720                 $point = IntVal( $codepoint );
 721                 if( Sanitizer::validateCodepoint( $point ) ) {
 722                         return sprintf( '&#%d;', $point );
 723                 } else {
 724                         return null;
 725                 }
 726         }
 727
 728         function hexCharReference( $codepoint ) {
 729                 $point = hexdec( $codepoint );
 730                 if( Sanitizer::validateCodepoint( $point ) ) {
 731                         return sprintf( '&#x%x;', $point );
 732                 } else {
 733                         return null;
 734                 }
 735         }
 736
 737         /**
 738          * Returns true if a given Unicode codepoint is a valid character in XML.
 739          * @param int $codepoint
 740          * @return bool
 741          */
 742         function validateCodepoint( $codepoint ) {
 743                 return ($codepoint ==    0x09)
 744                         || ($codepoint ==    0x0a)
 745                         || ($codepoint ==    0x0d)
 746                         || ($codepoint >=    0x20 && $codepoint <=   0xd7ff)
 747                         || ($codepoint >=  0xe000 && $codepoint <=   0xfffd)
 748                         || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
 749         }
 750
 751         /**
 752          * Decode any character references, numeric or named entities,
 753          * in the text and return a UTF-8 string.
 754          *
 755          * @param string $text
 756          * @return string
 757          * @access public
 758          */
 759         function decodeCharReferences( $text ) {
 760                 return preg_replace_callback(
 761                         MW_CHAR_REFS_REGEX,
 762                         array( 'Sanitizer', 'decodeCharReferencesCallback' ),
 763                         $text );
 764         }
 765
 766         /**
 767          * @param string $matches
 768          * @return string
 769          */
 770         function decodeCharReferencesCallback( $matches ) {
 771                 if( $matches[1] != '' ) {
 772                         return Sanitizer::decodeEntity( $matches[1] );
 773                 } elseif( $matches[2] != '' ) {
 774                         return  Sanitizer::decodeChar( intval( $matches[2] ) );
 775                 } elseif( $matches[3] != ''  ) {
 776                         return  Sanitizer::decodeChar( hexdec( $matches[3] ) );
 777                 } elseif( $matches[4] != '' ) {
 778                         return  Sanitizer::decodeChar( hexdec( $matches[4] ) );
 779                 }
 780                 # Last case should be an ampersand by itself
 781                 return $matches[0];
 782         }
 783
 784         /**
 785          * Return UTF-8 string for a codepoint if that is a valid
 786          * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
 787          * @param int $codepoint
 788          * @return string
 789          * @access private
 790          */
 791         function decodeChar( $codepoint ) {
 792                 if( Sanitizer::validateCodepoint( $codepoint ) ) {
 793                         return codepointToUtf8( $codepoint );
 794                 } else {
 795                         return UTF8_REPLACEMENT;
 796                 }
 797         }
 798
 799         /**
 800          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
 801          * return the UTF-8 encoding of that character. Otherwise, returns
 802          * pseudo-entity source (eg &foo;)
 803          *
 804          * @param string $name
 805          * @return string
 806          */
 807         function decodeEntity( $name ) {
 808                 global $wgHtmlEntities;
 809                 if( isset( $wgHtmlEntities[$name] ) ) {
 810                         return codepointToUtf8( $wgHtmlEntities[$name] );
 811                 } else {
 812                         return "&$name;";
 813                 }
 814         }
 815
 816         /**
 817          * Fetch the whitelist of acceptable attributes for a given
 818          * element name.
 819          *
 820          * @param string $element
 821          * @return array
 822          */
 823         function attributeWhitelist( $element ) {
 824                 static $list;
 825                 if( !isset( $list ) ) {
 826                         $list = Sanitizer::setupAttributeWhitelist();
 827                 }
 828                 return isset( $list[$element] )
 829                         ? $list[$element]
 830                         : array();
 831         }
 832
 833         /**
 834          * @return array
 835          */
 836         function setupAttributeWhitelist() {
 837                 $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
 838                 $block = array_merge( $common, array( 'align' ) );
 839                 $tablealign = array( 'align', 'char', 'charoff', 'valign' );
 840                 $tablecell = array( 'abbr',
 841                                     'axis',
 842                                     'headers',
 843                                     'scope',
 844                                     'rowspan',
 845                                     'colspan',
 846                                     'nowrap', # deprecated
 847                                     'width',  # deprecated
 848                                     'height', # deprecated
 849                                     'bgcolor' # deprecated
 850                                     );
 851
 852                 # Numbers refer to sections in HTML 4.01 standard describing the element.
 853                 # See: http://www.w3.org/TR/html4/
 854                 $whitelist = array (
 855                         # 7.5.4
 856                         'div'        => $block,
 857                         'center'     => $common, # deprecated
 858                         'span'       => $block, # ??
 859
 860                         # 7.5.5
 861                         'h1'         => $block,
 862                         'h2'         => $block,
 863                         'h3'         => $block,
 864                         'h4'         => $block,
 865                         'h5'         => $block,
 866                         'h6'         => $block,
 867
 868                         # 7.5.6
 869                         # address
 870
 871                         # 8.2.4
 872                         # bdo
 873
 874                         # 9.2.1
 875                         'em'         => $common,
 876                         'strong'     => $common,
 877                         'cite'       => $common,
 878                         # dfn
 879                         'code'       => $common,
 880                         # samp
 881                         # kbd
 882                         'var'        => $common,
 883                         # abbr
 884                         # acronym
 885
 886                         # 9.2.2
 887                         'blockquote' => array_merge( $common, array( 'cite' ) ),
 888                         # q
 889
 890                         # 9.2.3
 891                         'sub'        => $common,
 892                         'sup'        => $common,
 893
 894                         # 9.3.1
 895                         'p'          => $block,
 896
 897                         # 9.3.2
 898                         'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),
 899
 900                         # 9.3.4
 901                         'pre'        => array_merge( $common, array( 'width' ) ),
 902
 903                         # 9.4
 904                         'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
 905                         'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),
 906
 907                         # 10.2
 908                         'ul'         => array_merge( $common, array( 'type' ) ),
 909                         'ol'         => array_merge( $common, array( 'type', 'start' ) ),
 910                         'li'         => array_merge( $common, array( 'type', 'value' ) ),
 911
 912                         # 10.3
 913                         'dl'         => $common,
 914                         'dd'         => $common,
 915                         'dt'         => $common,
 916
 917                         # 11.2.1
 918                         'table'      => array_merge( $common,
 919                                                                 array( 'summary', 'width', 'border', 'frame',
 920                                                                                          'rules', 'cellspacing', 'cellpadding',
 921                                                                                          'align', 'bgcolor', 'frame', 'rules',
 922                                                                                          'border' ) ),
 923
 924                         # 11.2.2
 925                         'caption'    => array_merge( $common, array( 'align' ) ),
 926
 927                         # 11.2.3
 928                         'thead'      => array_merge( $common, $tablealign ),
 929                         'tfoot'      => array_merge( $common, $tablealign ),
 930                         'tbody'      => array_merge( $common, $tablealign ),
 931
 932                         # 11.2.4
 933                         'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
 934                         'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),
 935
 936                         # 11.2.5
 937                         'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),
 938
 939                         # 11.2.6
 940                         'td'         => array_merge( $common, $tablecell, $tablealign ),
 941                         'th'         => array_merge( $common, $tablecell, $tablealign ),
 942
 943                         # 15.2.1
 944                         'tt'         => $common,
 945                         'b'          => $common,
 946                         'i'          => $common,
 947                         'big'        => $common,
 948                         'small'      => $common,
 949                         'strike'     => $common,
 950                         's'          => $common,
 951                         'u'          => $common,
 952
 953                         # 15.2.2
 954                         'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
 955                         # basefont
 956
 957                         # 15.3
 958                         'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
 959
 960                         # XHTML Ruby annotation text module, simple ruby only.
 961                         # http://www.w3c.org/TR/ruby/
 962                         'ruby'       => $common,
 963                         # rbc
 964                         # rtc
 965                         'rb'         => $common,
 966                         'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
 967                         'rp'         => $common,
 968                         );
 969                 return $whitelist;
 970         }
 971
 972         /**
 973          * Take a fragment of (potentially invalid) HTML and return
 974          * a version with any tags removed, encoded suitably for literal
 975          * inclusion in an attribute value.
 976          *
 977          * @param string $text HTML fragment
 978          * @return string
 979          */
 980         function stripAllTags( $text ) {
 981                 # Actual <tags>
 982                 $text = preg_replace( '/<[^>]*>/', '', $text );
 983
 984                 # Normalize &entities and whitespace
 985                 $text = Sanitizer::normalizeAttributeValue( $text );
 986
 987                 # Will be placed into "double-quoted" attributes,
 988                 # make sure remaining bits are safe.
 989                 $text = str_replace(
 990                         array('<', '>', '"'),
 991                         array('&lt;', '&gt;', '&quot;'),
 992                         $text );
 993
 994                 return $text;
 995         }
 996
 997 }
 998
 999 ?>